import chardet
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
# Path to the raw auto-listings CSV.
# Raw string: "C:\datasets\..." contains "\d" / "\A", which are invalid or
# ambiguous escape sequences in a normal string literal.
csv_file = r"C:\datasets\Auto Dataset.csv"
# Sniff the file's byte encoding with chardet so pandas can decode it correctly.
with open(csv_file, 'rb') as f:
    result = chardet.detect(f.read())
# Report the detected encoding (the pasted output below shows 'Windows-1252').
print(result['encoding'])
Windows-1252
# Path to the CSV (raw string so the backslashes stay literal).
csv_file = r"C:\datasets\Auto Dataset.csv"
# Read the CSV with the encoding detected by chardet above, instead of
# re-hard-coding 'Windows-1252' — this keeps working if the file changes.
df = pd.read_csv(csv_file, encoding=result['encoding'])
print(df.head())
# Inspect column names and dtypes to plan the cleaning steps.
print(df.columns)
print(df.dtypes)
dateCrawled name seller \
0 3/26/2016 17:47 Peugeot_807_160_NAVTECH_ON_BOARD privat
1 4/4/2016 13:38 BMW_740i_4_4_Liter_HAMANN_UMBAU_Mega_Optik privat
2 3/26/2016 18:57 Volkswagen_Golf_1.6_United privat
3 3/12/2016 16:58 Smart_smart_fortwo_coupe_softouch/F1/Klima/Pan... privat
4 4/1/2016 14:38 Ford_Focus_1_6_Benzin_TÜV_neu_ist_sehr_gepfleg... privat
offerType price abtest vehicleType yearOfRegistration gearbox \
0 Angebot $5,000 control bus 2004 manuell
1 Angebot $8,500 control limousine 1997 automatik
2 Angebot $8,990 test limousine 2009 manuell
3 Angebot $4,350 control kleinwagen 2007 automatik
4 Angebot $1,350 test kombi 2003 manuell
powerPS model odometer monthOfRegistration fuelType brand \
0 158 andere 150,000km 3 lpg peugeot
1 286 7er 150,000km 6 benzin bmw
2 102 golf 70,000km 7 benzin volkswagen
3 71 fortwo 70,000km 6 benzin smart
4 0 focus 150,000km 7 benzin ford
notRepairedDamage dateCreated nrOfPictures postalCode lastSeen
0 nein 3/26/2016 0:00 0 79588 4/6/2016 6:45
1 nein 4/4/2016 0:00 0 71034 4/6/2016 14:45
2 nein 3/26/2016 0:00 0 35394 4/6/2016 20:15
3 nein 3/12/2016 0:00 0 33729 3/15/2016 3:16
4 nein 4/1/2016 0:00 0 39218 4/1/2016 14:38
Index(['dateCrawled', 'name', 'seller', 'offerType', 'price', 'abtest',
'vehicleType', 'yearOfRegistration', 'gearbox', 'powerPS', 'model',
'odometer', 'monthOfRegistration', 'fuelType', 'brand',
'notRepairedDamage', 'dateCreated', 'nrOfPictures', 'postalCode',
'lastSeen'],
dtype='object')
dateCrawled object
name object
seller object
offerType object
price object
abtest object
vehicleType object
yearOfRegistration int64
gearbox object
powerPS int64
model object
odometer object
monthOfRegistration int64
fuelType object
brand object
notRepairedDamage object
dateCreated object
nrOfPictures int64
postalCode int64
lastSeen object
dtype: object
# --- Cleaning and train/test split ---------------------------------------
# 'price' arrives as strings like "$5,000": strip '$' and ',' then cast to float.
df['price'] = df['price'].replace(r'[\$,]', '', regex=True).astype(float)
# 'odometer' arrives as strings like "150,000km". Leaving it as text made
# get_dummies one-hot encode each distinct reading instead of treating
# mileage as a number — strip ',', 'k', 'm' and cast to float instead.
df['odometer'] = df['odometer'].replace(r'[,km]', '', regex=True).astype(float)
# Drop identifier, free-text and date-like columns with no predictive value here.
drop_columns = ['dateCrawled', 'name', 'seller', 'offerType', 'abtest', 'model', 'lastSeen']
df.drop(columns=drop_columns, inplace=True)
# One-hot encode the remaining categorical columns (drop_first avoids collinearity).
df = pd.get_dummies(df, drop_first=True)
# Features (X) and target (y).
X = df.drop(columns=['price'])
y = df['price']
# Hold out 20% for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit a Random Forest regressor (100 trees, fixed seed for reproducibility).
model = RandomForestRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
# For regressors, .score() returns the R^2 coefficient of determination,
# not classification accuracy — label it correctly.
print("R^2 on training dataset", model.score(X_train, y_train))
print("R^2 on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.8012865492695206 Accuracy on testing dataset -14.141918456127756
# Heavily regularized forest (depth-1 stumps) to probe the over/under-fitting
# trade-off seen in the unconstrained model above.
model = RandomForestRegressor(n_estimators=100, random_state=0, max_depth=1)
model.fit(X_train, y_train)
# .score() is R^2 for regressors, not accuracy — label it correctly.
print("R^2 on training dataset", model.score(X_train, y_train))
print("R^2 on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.001008273339773269 Accuracy on testing dataset -0.00721577271035545
# Predict on the held-out test set and evaluate.
y_pred = model.predict(X_test)
# mean_squared_error and r2_score are already imported at the top of the
# file, so the redundant mid-script import was removed.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)
Mean Squared Error: 14255937322.490925 R2 Score: -0.00721577271035545
print("Feature importance: \n", model.feature_importances_)
Feature importance: [0.24 0.22 0. 0. 0.03 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.1 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.01 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.38 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.02 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. ]
# Horizontal bar chart of the forest's feature importances.
# NOTE: the original figsize of (150, 100) *inches* rendered an enormous
# image that can exhaust memory; a tall-but-reasonable figure is used.
# (matplotlib/numpy are already imported at the top of the file.)
fig, ax = plt.subplots(figsize=(10, 30))
# One bar per model feature.
n_features = X_train.shape[1]
ax.barh(range(n_features), model.feature_importances_, align="center")
# Label each bar with its feature name.
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
ax.set_xlabel('Feature Importance')
plt.tight_layout()
plt.show()
# X, y and the train/test split were already created above with the same
# test_size and random_state, so re-deriving them here was redundant —
# the existing X_train/X_test/y_train/y_test are reused directly.
# Fit a Gradient Boosting regressor for comparison with the Random Forest.
model = GradientBoostingRegressor(n_estimators=100, random_state=0)
model.fit(X_train, y_train)
# .score() is R^2 for regressors, not accuracy — label it correctly.
print("R^2 on training dataset", model.score(X_train, y_train))
print("R^2 on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.8424825900005556 Accuracy on testing dataset -0.9874505787618353
# Predict on the held-out test set and evaluate the boosting model.
y_pred = model.predict(X_test)
# mean_squared_error and r2_score are already imported at the top of the
# file, so the redundant mid-script import was removed.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("Mean Squared Error:", mse)
print("R2 Score:", r2)
Mean Squared Error: 28129991259.107037 R2 Score: -0.9874505787618353
print("Feature importance: \n", model.feature_importances_)
Feature importance: [6.77623690e-02 9.89112861e-02 2.41586681e-03 0.00000000e+00 5.34428542e-02 0.00000000e+00 0.00000000e+00 3.01329825e-06 0.00000000e+00 2.27272565e-05 7.21735076e-03 0.00000000e+00 9.40271086e-05 0.00000000e+00 0.00000000e+00 7.18401207e-05 0.00000000e+00 0.00000000e+00 2.94354509e-02 0.00000000e+00 1.55252966e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 2.69450391e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 6.46169790e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.32992764e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.54330198e-04 0.00000000e+00 2.00704087e-04 0.00000000e+00 4.31511406e-03 0.00000000e+00 3.61924091e-05 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 8.42617142e-06 1.75398241e-05 0.00000000e+00 0.00000000e+00 7.17988432e-11 5.60342886e-04 4.56721463e-03 1.62381015e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 1.84629629e-04 0.00000000e+00 0.00000000e+00 4.38499090e-10 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 6.89695123e-01 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 9.67802968e-04 0.00000000e+00 0.00000000e+00 7.86742666e-06 0.00000000e+00 
0.00000000e+00 0.00000000e+00 0.00000000e+00 8.09185939e-03 3.53930637e-03 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
# Horizontal bar chart of the boosting model's feature importances.
# NOTE: the original figsize of (150, 100) *inches* rendered an enormous
# image that can exhaust memory; a tall-but-reasonable figure is used.
# (matplotlib/numpy are already imported at the top of the file.)
fig, ax = plt.subplots(figsize=(10, 30))
# One bar per model feature.
n_features = X_train.shape[1]
ax.barh(range(n_features), model.feature_importances_, align="center")
# Label each bar with its feature name.
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
ax.set_xlabel('Feature Importance')
plt.tight_layout()
plt.show()
# Fit a Decision Tree *Regressor* (the original comment mis-labelled it a
# classifier). With no depth limit it memorizes the training set.
model = DecisionTreeRegressor(random_state=0)
model.fit(X_train, y_train)
# .score() is R^2 for regressors, not accuracy — label it correctly.
print("R^2 on training dataset", model.score(X_train, y_train))
print("R^2 on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.9999999995329204 Accuracy on testing dataset -1.0779170076297566
# Fit a Decision Tree *Regressor* restricted to a single split (a stump)
# to contrast with the fully-grown, overfit tree above.
model = DecisionTreeRegressor(random_state=0, max_depth=1)
model.fit(X_train, y_train)
# .score() is R^2 for regressors, not accuracy — label it correctly.
print("R^2 on training dataset", model.score(X_train, y_train))
print("R^2 on testing dataset", model.score(X_test, y_test))
Accuracy on training dataset 0.00065387246845372 Accuracy on testing dataset -0.004053662111415779
print("Feature importance: \n", model.feature_importances_)
Feature importance: [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
from sklearn import tree
# Visualize the depth-1 regression tree.
# class_names is only meaningful for classifiers; passing it to plot_tree
# for a DecisionTreeRegressor is at best ignored, so it was dropped.
fig = plt.figure(figsize=(25, 15))
_ = tree.plot_tree(model, filled=True, rounded=True,
                   feature_names=X.columns)
plt.show()
# Horizontal bar chart of the decision tree's feature importances.
# NOTE: the original figsize of (150, 100) *inches* rendered an enormous
# image that can exhaust memory; a tall-but-reasonable figure is used.
# (matplotlib/numpy are already imported at the top of the file.)
fig, ax = plt.subplots(figsize=(10, 30))
# One bar per model feature.
n_features = X_train.shape[1]
ax.barh(range(n_features), model.feature_importances_, align="center")
# Label each bar with its feature name.
ax.set_yticks(np.arange(n_features))
ax.set_yticklabels(X.columns)
ax.set_xlabel('Feature Importance')
plt.tight_layout()
plt.show()